You are now the data slave to the principal investigator Dr. Vinca Monster. Dr. M is in the Grape Program at State U, and you are just a poor graduate student trying to get your degree. Dr. M is interested in wine preferences and the influences of physico-chemical properties on preferences. Her laboratory has gathered an extensive dataset on Portuguese white varietals.
You will find the white_wines.csv file and its description in my github repo [https://github.com/vhertzb/Regression-1](https://github.com/vhertzb/Regression-1).
Please use the techniques you have learned in the last two classes, specifically exploratory data analysis and linear regression, to determine the association of the wine properties with preference.
Prepare a report for presentation at the next Monster lab meeting about this dataset.
Rubric:
Exploration (summary statistics (the m’s), univariate graphs, multivariate graphs) Regression (Models explored, diagnostics completed, final model choice, justification)
Please include a concluding paragraph (or two) about the implications of your findings.
#load up necessary packages
library(HistData)
library(car)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2. http://CRAN.R-project.org/package=stargazer
# Load the white-wine data set into a data frame using readr.
# NOTE(review): this is an absolute path on the author's machine; the
# assignment text says the same CSV is available in the course GitHub repo.
library(readr)
White_wines <- read_csv(
  file = "~/Documents/Big Data Class/N741 Data Wrangling/InClass2.1/Regression-1/White_wines.csv"
)
## Parsed with column specification:
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_integer()
## )
# Several CSV headers contain spaces (e.g. `fixed acidity`), which are not
# syntactic R names. make.names() rewrites them as dotted names
# (fixed.acidity, volatile.acidity, ...) so they can be used directly in
# model formulas and with `$`.
colnames(White_wines) <- make.names(colnames(White_wines))
# Min / quartiles / median / mean / max for every column
summary(White_wines)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
library(Rcmdr)
## Loading required package: splines
## Loading required package: RcmdrMisc
## Loading required package: sandwich
## The Commander GUI is launched only in interactive sessions
# Exploratory scatterplot matrices of quality against the untransformed
# predictors, split over three panels. Every panel uses the same options:
# histograms on the diagonal, a smoother (span 0.5) switched on, and the
# regression line, spread lines, ellipses and point labels switched off.

# Panel 1: residual sugar, sulphates, total sulfur dioxide, volatile acidity
scatterplotMatrix(
  ~ quality + residual.sugar + sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

# Panel 2: pH, density, free sulfur dioxide, chlorides
scatterplotMatrix(
  ~ quality + pH + density + free.sulfur.dioxide + chlorides,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

# Panel 3: citric acid, fixed acidity, alcohol
scatterplotMatrix(
  ~ quality + citric.acid + fixed.acidity + alcohol,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)
Free sulfur dioxide, fixed acidity, and density appear to have skewed (non-normal) distributions in the histograms above, so I log-transformed them (base 2) below.
# Add base-2 log versions of free sulfur dioxide, fixed acidity and density
# as new columns; the original columns are left in place.
White_wines$free.sulf.diox.log <- log2(White_wines$free.sulfur.dioxide)
White_wines$fixed.acidity.log <- log2(White_wines$fixed.acidity)
White_wines$density.log <- log2(White_wines$density)
# Redraw the exploratory scatterplot matrices, now using the log2-transformed
# columns where they exist. Plot options are the same as for the raw panels.

# Panel 1: contains none of the transformed variables, so it repeats the
# earlier plot of the same formula.
scatterplotMatrix(
  ~ quality + residual.sugar + sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

# Panel 2: density and free sulfur dioxide replaced by their log2 versions
scatterplotMatrix(
  ~ quality + pH + density.log + free.sulf.diox.log + chlorides,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)

# Panel 3: fixed acidity replaced by its log2 version
scatterplotMatrix(
  ~ quality + citric.acid + fixed.acidity.log + alcohol,
  data = White_wines,
  diagonal = 'histogram',
  smooth = TRUE, span = 0.5, spread = FALSE,
  reg.line = FALSE, ellipse = FALSE, levels = c(.5, .9),
  id.n = 0
)
Those transformations help a little bit.
# Pairwise correlation matrix of quality and the eleven candidate predictors
# (log2 versions of density, fixed acidity and free sulfur dioxide).
# `use = "complete"` is an abbreviation accepted by cor() for "complete.obs".
cor(
  White_wines[, c(
    "quality",
    "alcohol",
    "chlorides",
    "citric.acid",
    "density.log",
    "fixed.acidity.log",
    "free.sulf.diox.log",
    "pH",
    "residual.sugar",
    "sulphates",
    "total.sulfur.dioxide",
    "volatile.acidity"
  )],
  use = "complete"
)
## quality alcohol chlorides citric.acid
## quality 1.000000000 0.43557472 -0.20993441 -0.009209091
## alcohol 0.435574715 1.00000000 -0.36018871 -0.075728730
## chlorides -0.209934411 -0.36018871 1.00000000 0.114364448
## citric.acid -0.009209091 -0.07572873 0.11436445 1.000000000
## density.log -0.307723788 -0.78135429 0.25757406 0.149442828
## fixed.acidity.log -0.109736681 -0.13108514 0.03305563 0.292566248
## free.sulf.diox.log 0.099058582 -0.22409197 0.09168564 0.084276395
## pH 0.099427246 0.12143210 -0.09043946 -0.163748211
## residual.sugar -0.097576829 -0.45063122 0.08868454 0.094211624
## sulphates 0.053677877 -0.01743277 0.01676288 0.062330940
## total.sulfur.dioxide -0.174737218 -0.44889210 0.19891030 0.121130798
## volatile.acidity -0.194722969 0.06771794 0.07051157 -0.149471811
## density.log fixed.acidity.log free.sulf.diox.log
## quality -0.30772379 -0.10973668 0.09905858
## alcohol -0.78135429 -0.13108514 -0.22409197
## chlorides 0.25757406 0.03305563 0.09168564
## citric.acid 0.14944283 0.29256625 0.08427640
## density.log 1.00000000 0.27695036 0.28317156
## fixed.acidity.log 0.27695036 1.00000000 -0.04534913
## free.sulf.diox.log 0.28317156 -0.04534913 1.00000000
## pH -0.09368819 -0.43478921 0.02199554
## residual.sugar 0.83864966 0.10237716 0.30293472
## sulphates 0.07444942 -0.01415546 0.06084248
## total.sulfur.dioxide 0.53044357 0.10259928 0.59619976
## volatile.acidity 0.02661505 -0.02974209 -0.11663198
## pH residual.sugar sulphates
## quality 0.099427246 -0.09757683 0.05367788
## alcohol 0.121432099 -0.45063122 -0.01743277
## chlorides -0.090439456 0.08868454 0.01676288
## citric.acid -0.163748211 0.09421162 0.06233094
## density.log -0.093688189 0.83864966 0.07444942
## fixed.acidity.log -0.434789207 0.10237716 -0.01415546
## free.sulf.diox.log 0.021995543 0.30293472 0.06084248
## pH 1.000000000 -0.19413345 0.15595150
## residual.sugar -0.194133454 1.00000000 -0.02666437
## sulphates 0.155951497 -0.02666437 1.00000000
## total.sulfur.dioxide 0.002320972 0.40143931 0.13456237
## volatile.acidity -0.031915368 0.06428606 -0.03572815
## total.sulfur.dioxide volatile.acidity
## quality -0.174737218 -0.19472297
## alcohol -0.448892102 0.06771794
## chlorides 0.198910300 0.07051157
## citric.acid 0.121130798 -0.14947181
## density.log 0.530443572 0.02661505
## fixed.acidity.log 0.102599278 -0.02974209
## free.sulf.diox.log 0.596199757 -0.11663198
## pH 0.002320972 -0.03191537
## residual.sugar 0.401439311 0.06428606
## sulphates 0.134562367 -0.03572815
## total.sulfur.dioxide 1.000000000 0.08926050
## volatile.acidity 0.089260504 1.00000000
The variable most correlated with quality is alcohol (r=.44), so I will use that as my primary independent variable. Log density is highly correlated with alcohol (r=-.78) and residual sugar (r=.84), and log free sulfur dioxide is somewhat less strongly correlated with total sulfur dioxide (r=.60), so I will consider whether these variables all need to be in the model.
Regression on the full dataset (no separate training/test split is made here).
# Model 1 (full model): quality regressed on all eleven predictors, using the
# log2 versions of density, fixed acidity and free sulfur dioxide.
RegModel.1 <- lm(
  quality ~ alcohol + chlorides + citric.acid + density.log +
    fixed.acidity.log + free.sulf.diox.log + pH + residual.sugar +
    sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.1)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + density.log +
## fixed.acidity.log + free.sulf.diox.log + pH + residual.sugar +
## sulphates + total.sulfur.dioxide + volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4172 -0.5008 -0.0287 0.4585 3.0836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.249e+00 5.481e-01 -2.279 0.022726 *
## alcohol 1.976e-01 2.411e-02 8.199 3.07e-16 ***
## chlorides -4.037e-01 5.383e-01 -0.750 0.453354
## citric.acid 3.611e-03 9.443e-02 0.038 0.969494
## density.log -9.368e+01 1.311e+01 -7.144 1.04e-12 ***
## fixed.acidity.log 3.700e-01 9.982e-02 3.707 0.000212 ***
## free.sulf.diox.log 2.163e-01 1.780e-02 12.155 < 2e-16 ***
## pH 6.609e-01 1.052e-01 6.285 3.57e-10 ***
## residual.sugar 7.305e-02 7.478e-03 9.768 < 2e-16 ***
## sulphates 6.300e-01 9.898e-02 6.364 2.14e-10 ***
## total.sulfur.dioxide -1.901e-03 3.695e-04 -5.146 2.76e-07 ***
## volatile.acidity -1.651e+00 1.130e-01 -14.615 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7412 on 4886 degrees of freedom
## Multiple R-squared: 0.3012, Adjusted R-squared: 0.2996
## F-statistic: 191.4 on 11 and 4886 DF, p-value: < 2.2e-16
Interpretation: Density standard error is very high.
# Model 1.2: same eleven predictors as Model 1 but with density, fixed acidity
# and free sulfur dioxide left untransformed, to check whether the log
# transforms change the fit.
RegModel.1.2 <- lm(
  quality ~ alcohol + chlorides + citric.acid + density +
    fixed.acidity + free.sulfur.dioxide + pH + residual.sugar +
    sulphates + total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.1.2)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + density +
## fixed.acidity + free.sulfur.dioxide + pH + residual.sugar +
## sulphates + total.sulfur.dioxide + volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8348 -0.4934 -0.0379 0.4637 3.1143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.502e+02 1.880e+01 7.987 1.71e-15 ***
## alcohol 1.935e-01 2.422e-02 7.988 1.70e-15 ***
## chlorides -2.473e-01 5.465e-01 -0.452 0.65097
## citric.acid 2.209e-02 9.577e-02 0.231 0.81759
## density -1.503e+02 1.907e+01 -7.879 4.04e-15 ***
## fixed.acidity 6.552e-02 2.087e-02 3.139 0.00171 **
## free.sulfur.dioxide 3.733e-03 8.441e-04 4.422 9.99e-06 ***
## pH 6.863e-01 1.054e-01 6.513 8.10e-11 ***
## residual.sugar 8.148e-02 7.527e-03 10.825 < 2e-16 ***
## sulphates 6.315e-01 1.004e-01 6.291 3.44e-10 ***
## total.sulfur.dioxide -2.857e-04 3.781e-04 -0.756 0.44979
## volatile.acidity -1.863e+00 1.138e-01 -16.373 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4886 DF, p-value: < 2.2e-16
The logged variables seem to matter: R-squared is 0.282 with the untransformed variables versus 0.301 with the logged versions.
# Model 2: Model 1 with density.log removed (ten predictors).
RegModel.2 <- lm(
  quality ~ alcohol + chlorides + citric.acid + fixed.acidity.log +
    free.sulf.diox.log + pH + residual.sugar + sulphates +
    total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.2)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + fixed.acidity.log +
## free.sulf.diox.log + pH + residual.sugar + sulphates + total.sulfur.dioxide +
## volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3168 -0.5017 -0.0267 0.4521 3.1138
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3159630 0.4162258 3.162 0.00158 **
## alcohol 0.3505671 0.0111413 31.465 < 2e-16 ***
## chlorides -1.0109423 0.5342996 -1.892 0.05854 .
## citric.acid -0.0418197 0.0946938 -0.442 0.65878
## fixed.acidity.log -0.1341510 0.0709588 -1.891 0.05874 .
## free.sulf.diox.log 0.2386673 0.0176120 13.551 < 2e-16 ***
## pH 0.1830373 0.0815589 2.244 0.02486 *
## residual.sugar 0.0227248 0.0025233 9.006 < 2e-16 ***
## sulphates 0.4414986 0.0958906 4.604 4.25e-06 ***
## total.sulfur.dioxide -0.0024369 0.0003636 -6.702 2.29e-11 ***
## volatile.acidity -1.7368228 0.1129034 -15.383 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.745 on 4887 degrees of freedom
## Multiple R-squared: 0.2939, Adjusted R-squared: 0.2924
## F-statistic: 203.4 on 10 and 4887 DF, p-value: < 2.2e-16
Interpretation: for each one-unit increase in alcohol, predicted quality increases by about 0.35 points, holding the other predictors constant. With density dropped, there are no anomalous standard errors. R-squared decreases slightly (0.301 to 0.294), but not enough to be practically significant.
# Model 3: Model 2 with citric.acid also removed (nine predictors).
RegModel.3 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log +
    free.sulf.diox.log + pH + residual.sugar + sulphates +
    total.sulfur.dioxide + volatile.acidity,
  data = White_wines
)
summary(RegModel.3)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log +
## free.sulf.diox.log + pH + residual.sugar + sulphates + total.sulfur.dioxide +
## volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3099 -0.5021 -0.0274 0.4508 3.1118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3230570 0.4158815 3.181 0.00148 **
## alcohol 0.3502022 0.0111097 31.522 < 2e-16 ***
## chlorides -1.0387422 0.5305348 -1.958 0.05030 .
## fixed.acidity.log -0.1416017 0.0689184 -2.055 0.03997 *
## free.sulf.diox.log 0.2385761 0.0176094 13.548 < 2e-16 ***
## pH 0.1848631 0.0814473 2.270 0.02327 *
## residual.sugar 0.0226647 0.0025194 8.996 < 2e-16 ***
## sulphates 0.4389541 0.0957094 4.586 4.62e-06 ***
## total.sulfur.dioxide -0.0024471 0.0003629 -6.744 1.72e-11 ***
## volatile.acidity -1.7284995 0.1113101 -15.529 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7449 on 4888 degrees of freedom
## Multiple R-squared: 0.2938, Adjusted R-squared: 0.2925
## F-statistic: 226 on 9 and 4888 DF, p-value: < 2.2e-16
Let’s see what happens without density and then citric acid.
# Compare the three candidate models (full model, without density.log, and
# without density.log and citric.acid) side by side in one HTML table.
# The title previously said "2" although three models are passed in.
stargazer(RegModel.1, RegModel.2, RegModel.3,
          title = "Comparison of 3 Regression outputs",
          type = "html", align = TRUE)
| Dependent variable: | |||
| quality | |||
| (1) | (2) | (3) | |
| alcohol | 0.198*** | 0.351*** | 0.350*** |
| (0.024) | (0.011) | (0.011) | |
| chlorides | -0.404 | -1.011* | -1.039* |
| (0.538) | (0.534) | (0.531) | |
| citric.acid | 0.004 | -0.042 | |
| (0.094) | (0.095) | ||
| density.log | -93.679*** | ||
| (13.113) | |||
| fixed.acidity.log | 0.370*** | -0.134* | -0.142** |
| (0.100) | (0.071) | (0.069) | |
| free.sulf.diox.log | 0.216*** | 0.239*** | 0.239*** |
| (0.018) | (0.018) | (0.018) | |
| pH | 0.661*** | 0.183** | 0.185** |
| (0.105) | (0.082) | (0.081) | |
| residual.sugar | 0.073*** | 0.023*** | 0.023*** |
| (0.007) | (0.003) | (0.003) | |
| sulphates | 0.630*** | 0.441*** | 0.439*** |
| (0.099) | (0.096) | (0.096) | |
| total.sulfur.dioxide | -0.002*** | -0.002*** | -0.002*** |
| (0.0004) | (0.0004) | (0.0004) | |
| volatile.acidity | -1.651*** | -1.737*** | -1.728*** |
| (0.113) | (0.113) | (0.111) | |
| Constant | -1.249** | 1.316*** | 1.323*** |
| (0.548) | (0.416) | (0.416) | |
| Observations | 4,898 | 4,898 | 4,898 |
| R2 | 0.301 | 0.294 | 0.294 |
| Adjusted R2 | 0.300 | 0.292 | 0.293 |
| Residual Std. Error | 0.741 (df = 4886) | 0.745 (df = 4887) | 0.745 (df = 4888) |
| F Statistic | 191.408*** (df = 11; 4886) | 203.363*** (df = 10; 4887) | 225.974*** (df = 9; 4888) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
# diagnostics using residual plots
# Draws residual plots for Model 1 and prints a table with one test statistic
# and p-value per predictor, plus a final "Tukey test" row.
residualPlots(RegModel.1)
## Test stat Pr(>|t|)
## alcohol 5.270 0.000
## chlorides 1.403 0.161
## citric.acid -4.424 0.000
## density.log 5.229 0.000
## fixed.acidity.log -3.103 0.002
## free.sulf.diox.log -11.193 0.000
## pH 0.964 0.335
## residual.sugar 2.481 0.013
## sulphates 0.047 0.963
## total.sulfur.dioxide -8.039 0.000
## volatile.acidity 3.210 0.001
## Tukey test 2.656 0.008
# Same residual-plot diagnostics for Model 2 (density.log removed).
residualPlots(RegModel.2)
## Test stat Pr(>|t|)
## alcohol 5.440 0.000
## chlorides 2.085 0.037
## citric.acid -4.385 0.000
## fixed.acidity.log -3.854 0.000
## free.sulf.diox.log -10.941 0.000
## pH 0.287 0.774
## residual.sugar -1.326 0.185
## sulphates 0.051 0.959
## total.sulfur.dioxide -7.709 0.000
## volatile.acidity 1.905 0.057
## Tukey test 1.316 0.188
# Same residual-plot diagnostics for Model 3 (density.log and citric.acid
# removed).
residualPlots(RegModel.3)
## Test stat Pr(>|t|)
## alcohol 5.393 0.000
## chlorides 2.077 0.038
## fixed.acidity.log -3.864 0.000
## free.sulf.diox.log -10.951 0.000
## pH 0.279 0.780
## residual.sugar -1.343 0.179
## sulphates 0.052 0.958
## total.sulfur.dioxide -7.711 0.000
## volatile.acidity 1.911 0.056
## Tukey test 1.312 0.190
# total.sulfur.dioxide had a very small p-value in the residual-plot tests
# for Models 1-3, so try a base-2 log version of it as well.
White_wines$tot.sulf.diox.log <- log2(White_wines$total.sulfur.dioxide)

# Model 4: Model 3 with total.sulfur.dioxide replaced by its log2 version.
RegModel.4 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log +
    free.sulf.diox.log + pH + residual.sugar + sulphates +
    tot.sulf.diox.log + volatile.acidity,
  data = White_wines
)
summary(RegModel.4)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log +
## free.sulf.diox.log + pH + residual.sugar + sulphates + tot.sulf.diox.log +
## volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4389 -0.5015 -0.0234 0.4476 3.1550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.014293 0.434549 4.635 3.66e-06 ***
## alcohol 0.363387 0.011044 32.903 < 2e-16 ***
## chlorides -1.103123 0.532734 -2.071 0.03844 *
## fixed.acidity.log -0.182052 0.069065 -2.636 0.00842 **
## free.sulf.diox.log 0.206141 0.018166 11.348 < 2e-16 ***
## pH 0.152857 0.081812 1.868 0.06177 .
## residual.sugar 0.021325 0.002524 8.447 < 2e-16 ***
## sulphates 0.392168 0.095962 4.087 4.45e-05 ***
## tot.sulf.diox.log -0.103892 0.032194 -3.227 0.00126 **
## volatile.acidity -1.829711 0.110961 -16.490 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7476 on 4888 degrees of freedom
## Multiple R-squared: 0.2888, Adjusted R-squared: 0.2875
## F-statistic: 220.5 on 9 and 4888 DF, p-value: < 2.2e-16
# Residual-plot diagnostics for Model 4 (log2 total sulfur dioxide).
residualPlots(RegModel.4)
## Test stat Pr(>|t|)
## alcohol 5.232 0.000
## chlorides 2.258 0.024
## fixed.acidity.log -3.871 0.000
## free.sulf.diox.log -11.958 0.000
## pH 0.352 0.725
## residual.sugar -1.086 0.277
## sulphates 0.383 0.702
## tot.sulf.diox.log -8.952 0.000
## volatile.acidity 2.338 0.019
## Tukey test 1.060 0.289
# Model 5: Model 4 with both sulfur dioxide terms (free and total) removed,
# leaving seven predictors.
RegModel.5 <- lm(
  quality ~ alcohol + chlorides + fixed.acidity.log + pH +
    residual.sugar + sulphates + volatile.acidity,
  data = White_wines
)
summary(RegModel.5)
##
## Call:
## lm(formula = quality ~ alcohol + chlorides + fixed.acidity.log +
## pH + residual.sugar + sulphates + volatile.acidity, data = White_wines)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4166 -0.4956 -0.0341 0.4640 3.1425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.498311 0.414340 6.030 1.76e-09 ***
## alcohol 0.364534 0.010735 33.957 < 2e-16 ***
## chlorides -0.876093 0.539098 -1.625 0.104204
## fixed.acidity.log -0.268134 0.069156 -3.877 0.000107 ***
## pH 0.164578 0.082277 2.000 0.045524 *
## residual.sugar 0.027958 0.002453 11.400 < 2e-16 ***
## sulphates 0.415638 0.096470 4.308 1.68e-05 ***
## volatile.acidity -2.078582 0.109253 -19.025 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7587 on 4890 degrees of freedom
## Multiple R-squared: 0.2671, Adjusted R-squared: 0.2661
## F-statistic: 254.6 on 7 and 4890 DF, p-value: < 2.2e-16
# Residual-plot diagnostics for Model 5 (no sulfur dioxide terms).
residualPlots(RegModel.5)
## Test stat Pr(>|t|)
## alcohol 5.776 0.000
## chlorides 2.403 0.016
## fixed.acidity.log -4.494 0.000
## pH -0.300 0.764
## residual.sugar -2.426 0.015
## sulphates 0.482 0.630
## volatile.acidity 2.319 0.020
## Tukey test 0.087 0.930
Dropping both sulfur dioxide terms loses too much R-squared (0.289 for Model 4 vs. 0.267 for Model 5). Stick with Model 4.
# Added-variable plots for the chosen model (Model 4), one panel per predictor.
#   id.n   - number of points to label in each panel
#   id.cex - size of the point labels (not of the plotted dots)
avPlots(RegModel.4, id.cex = 0.7, id.n = 3)

# Quantile-quantile plot of Model 4's residuals, labelling the 3 most extreme
# points.
qqPlot(RegModel.4, id.n = 3)
## 4746 3308 775
## 1 2 4898
# here, id.n identifies the n observations with the largest residuals in absolute value
Are there any outliers?
#run Bonferroni test for outliers
# For each flagged observation the output lists its studentized residual
# (rstudent), the unadjusted p-value and the Bonferroni-adjusted p-value.
outlierTest(RegModel.4)
## rstudent unadjusted p-value Bonferonni p
## 4746 -4.622511 3.8889e-06 0.019048
## 3308 -4.498933 6.9880e-06 0.034227
Are there any points that are of high influence?
#identify highly influential points
# Index plots of influence diagnostics for Model 4; id.n=3 asks for the three
# most extreme observations to be labelled.
influenceIndexPlot(RegModel.4, id.n=3)
NB. If there are points that are a) outliers AND b) highly influential, these have potential to change the inference. You should consider removing them.
How do we make heads or tails out of the plots above? One way is with an influence plot.
#make influence plot
# Single combined influence plot for Model 4. The printed table gives the
# studentized residual (StudRes), hat value (Hat) and Cook's distance (CookD)
# of the labelled observations.
influencePlot(RegModel.4, id.n=3)
## StudRes Hat CookD
## 485 -0.46699974 0.044148299 1.007456e-03
## 741 -4.21888688 0.009063432 1.622380e-02
## 775 4.23185426 0.002029076 3.628626e-03
## 1218 0.82199081 0.032476245 2.268125e-03
## 2782 0.09801707 0.056049846 5.705804e-05
## 3308 -4.49893274 0.003901068 7.895759e-03
## 3902 -2.76136044 0.018625734 1.445229e-02
## 4746 -4.62251086 0.005604332 1.199263e-02
Another diagnostic is to test for heteroskedasticity (i.e., the variance of the error term is not constant).
#test for heteroskedasticity
# Non-constant variance score test for Model 4; as the output shows, the
# variance is modelled as a function of the fitted values.
ncvTest(RegModel.4)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 12.49315 Df = 1 p = 0.0004084476
We also want to look for multicollinearity, that is, whether some of our independent variables are highly correlated with one another. We do this by looking at the Variance Inflation Factor (VIF). A GVIF > 4 suggests collinearity.
# Variance inflation factor for each predictor in Model 4.
vif(RegModel.4)
## alcohol chlorides fixed.acidity.log
## 1.618513 1.186995 1.285326
## free.sulf.diox.log pH residual.sugar
## 1.804186 1.337197 1.436486
## sulphates tot.sulf.diox.log volatile.acidity
## 1.050918 2.135394 1.096020